# =========================
# Windows-safe LDA Pipeline
# =========================

import os
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import CoherenceModel, LdaMulticore
from nltk.stem import WordNetLemmatizer
from multiprocessing import freeze_support

# =========================
# Global parameters (can be described in the paper)
# =========================
TEXT_DIR = "Text Samples"  # directory scanned by main() for "*reports.txt" files
STOPWORDS_FILE = "stopname.txt"  # stop-word list, read line by line in main()
PHRASES_FILE = "phrase.txt"  # multi-word phrases, one per line; spaces become "_"

NUM_TOPICS_RANGE = range(2, 11)  # candidate topic counts k = 2..10, one model each
NUM_WORDS = 30  # number of top words exported per topic
PASSES = 500  # forwarded as `passes` to LdaMulticore
WORKERS = 8  # forwarded as `workers` to LdaMulticore
RANDOM_STATE = 42  # forwarded as `random_state` to LdaMulticore

# =========================
# Text preprocessing function
# =========================
def preprocess_documents(documents, stopwords, phrases):
    """Tokenize, filter and lemmatize documents, keeping phrases as single tokens.

    Each document is lowercased, every known phrase written with spaces is
    rewritten to its underscore-joined form, and the text is split on ``\\w+``
    (which keeps underscore-joined phrases intact). Tokens that are stop words
    or phrase tokens are dropped and the remainder is lemmatized. Finally each
    phrase contained in the document is appended exactly once, regardless of
    how often it occurs.

    Args:
        documents: Iterable of raw document strings.
        stopwords: Container of stop words, compared against lowercased,
            not-yet-lemmatized tokens.
        phrases: Iterable of underscore-joined phrases (e.g. ``"climate_change"``).
            Blank entries are ignored; matching is case-insensitive and the
            emitted phrase tokens are lowercase.

    Returns:
        A list with one token list per input document, in input order.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    lemmatizer = WordNetLemmatizer()

    # Normalise the phrase list once: lowercase so it agrees with the
    # lowercased text, drop blank entries (an empty phrase is "contained" in
    # every string and would add a "" token to every document), and dedupe
    # while preserving order.
    joined_phrases = list(dict.fromkeys(p.lower() for p in phrases if p.strip()))
    phrase_set = set(joined_phrases)  # O(1) membership test per token
    spaced_and_joined = [(p.replace("_", " "), p) for p in joined_phrases]

    processed_texts = []

    for doc in documents:
        # Lowercase before phrase matching so "Climate change" is recognised.
        doc = doc.lower()
        for spaced, joined in spaced_and_joined:
            doc = doc.replace(spaced, joined)

        tokens = [
            lemmatizer.lemmatize(t)
            for t in tokenizer.tokenize(doc)
            if t not in stopwords and t not in phrase_set
        ]

        # Substring check (not token check) so phrases containing characters
        # the tokenizer would split on are still detected.
        tokens.extend(p for p in joined_phrases if p in doc)
        processed_texts.append(tokens)

    return processed_texts

# =========================
# LDA analysis for a single year
# =========================
def _topic_word_rows(lda, dictionary, year, num_topics):
    """Return one record per (topic, word) for the top NUM_WORDS words of each topic."""
    return [
        {
            "Year": year,
            "NumTopics": num_topics,
            "TopicID": topic_id,
            "Word": dictionary[word_id],
            "Probability": prob,
        }
        for topic_id in range(num_topics)
        for word_id, prob in lda.get_topic_terms(topicid=topic_id, topn=NUM_WORDS)
    ]


def _save_lda_tables(metrics_df, topics_df, year, output_dir):
    """Write the metrics workbook and the per-k topic-word workbook for one year."""
    metrics_df.to_excel(
        os.path.join(output_dir, f"{year}_LDA_Metrics.xlsx"),
        index=False,
    )

    with pd.ExcelWriter(
        os.path.join(output_dir, f"{year}_LDA_Topics.xlsx"),
        engine="xlsxwriter",
    ) as writer:
        # One sheet per candidate topic count.
        for k in NUM_TOPICS_RANGE:
            sheet = topics_df[topics_df["NumTopics"] == k]
            sheet.to_excel(writer, sheet_name=f"{k}_Topics", index=False)


def _save_lda_evaluation_plot(metrics_df, year, output_dir):
    """Save the side-by-side perplexity / coherence curves as a 300 dpi PNG."""
    fig, (ax_perplexity, ax_coherence) = plt.subplots(1, 2, figsize=(12, 5))
    try:
        ax_perplexity.plot(metrics_df["NumTopics"], metrics_df["Perplexity"])
        ax_perplexity.set_xlabel("Number of Topics")
        ax_perplexity.set_ylabel("Perplexity")
        ax_perplexity.set_title(f"{year} Perplexity")

        ax_coherence.plot(metrics_df["NumTopics"], metrics_df["Coherence"])
        ax_coherence.set_xlabel("Number of Topics")
        ax_coherence.set_ylabel("Coherence (c_v)")
        ax_coherence.set_title(f"{year} Coherence")

        fig.tight_layout()
        fig.savefig(
            os.path.join(output_dir, f"{year}_LDA_Evaluation.png"),
            dpi=300,
        )
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)


def run_lda_for_year(txt_path, stopwords, phrases, output_dir):
    """Train and evaluate LDA models for one yearly text file and save the results.

    The file is read as one document per non-blank line. For every topic count
    in NUM_TOPICS_RANGE an LdaMulticore model is trained and its perplexity and
    c_v coherence are recorded together with the top NUM_WORDS words of each
    topic. Three files are written to ``output_dir``:
    ``<year>_LDA_Metrics.xlsx``, ``<year>_LDA_Topics.xlsx`` and
    ``<year>_LDA_Evaluation.png``.

    Args:
        txt_path: Path to a UTF-8 text file; the first four characters of its
            base name are used as the year label.
        stopwords: Stop words forwarded to ``preprocess_documents``.
        phrases: Phrases forwarded to ``preprocess_documents``.
        output_dir: Existing directory that receives the output files.

    Raises:
        ValueError: If the file yields no non-empty document after preprocessing.
    """
    year = os.path.basename(txt_path)[:4]

    with open(txt_path, "r", encoding="utf-8") as f:
        # Blank lines are not documents; skip them instead of modelling them
        # as empty documents.
        documents = [line for line in f if line.strip()]

    # Also drop documents that end up with no tokens after preprocessing.
    texts = [
        tokens
        for tokens in preprocess_documents(documents, stopwords, phrases)
        if tokens
    ]
    if not texts:
        raise ValueError(f"no usable documents found in {txt_path!r}")

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    metrics = []
    topic_records = []

    for k in NUM_TOPICS_RANGE:
        lda = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            passes=PASSES,
            workers=WORKERS,
            random_state=RANDOM_STATE,
        )

        # log_perplexity() yields a per-word likelihood bound; 2 ** (-bound)
        # turns it into a perplexity value.
        perplexity = 2 ** (-lda.log_perplexity(corpus))
        coherence = CoherenceModel(
            model=lda,
            texts=texts,
            dictionary=dictionary,
            coherence="c_v",
        ).get_coherence()

        metrics.append({
            "Year": year,
            "NumTopics": k,
            "Perplexity": perplexity,
            "Coherence": coherence,
        })
        topic_records.extend(_topic_word_rows(lda, dictionary, year, k))

    metrics_df = pd.DataFrame(metrics)
    topics_df = pd.DataFrame(topic_records)

    # Save tables
    _save_lda_tables(metrics_df, topics_df, year, output_dir)

    # Save figure (300 dpi)
    _save_lda_evaluation_plot(metrics_df, year, output_dir)

# =========================
# Main program (required on Windows)
# =========================
def main():
    """Load the stop-word and phrase lists and run the LDA analysis per yearly file.

    Every file in TEXT_DIR whose name ends with ``reports.txt`` is processed,
    in sorted name order, and its results are written to ``LDA_Results``
    (created if missing).
    """
    # "utf-8-sig" reads plain UTF-8 too, but also strips a leading BOM, which
    # would otherwise become part of the first entry.
    with open(STOPWORDS_FILE, "r", encoding="utf-8-sig") as f:
        # Tokens are lowercased before the stop-word check, so normalise the
        # list the same way; stray whitespace and blank lines are dropped.
        stopwords = {
            word
            for word in (line.strip().lower() for line in f)
            if word
        }

    with open(PHRASES_FILE, "r", encoding="utf-8-sig") as f:
        # Skip blank lines: an empty phrase would match every document.
        phrases = [
            phrase.replace(" ", "_")
            for phrase in (line.strip() for line in f)
            if phrase
        ]

    output_dir = "LDA_Results"
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir() order is arbitrary; sort for a reproducible processing order.
    for file in sorted(os.listdir(TEXT_DIR)):
        if file.endswith("reports.txt"):
            run_lda_for_year(
                txt_path=os.path.join(TEXT_DIR, file),
                stopwords=stopwords,
                phrases=phrases,
                output_dir=output_dir,
            )

# =========================
# Key for multiprocessing on Windows
# =========================
# The __main__ guard ensures main() runs only when this file is executed as a
# script, not when it is imported.
# NOTE(review): per the multiprocessing docs, freeze_support() matters only
# when the program is frozen into a Windows executable — confirm it is needed.
if __name__ == "__main__":
    freeze_support()
    main()
